# Load plotting and diagnostic-test dependencies.
# library() errors immediately if a package is missing; require() merely
# returns FALSE, silently deferring the failure to first use.
library(ggplot2)
library(lmtest)
library(car)
# Build the predictors, held-out observations, and error vector for one
# simulated data set, optionally violating a single regression assumption.
#
# Args:
#   sample_size:    number of errors to draw
#   orig_new_obsv:  held-out predictor values (vector, or matrix for the
#                   multicollinearity case)
#   violation_name: which assumption to violate; one of "None",
#                   "Nonlinear relationship", "Non-normal errors",
#                   "Heterogeneous variances", "Correlated Errors",
#                   "Multicolinearity"
#   violation_func: violation-specific knob: a transformation function, an
#                   error-distribution name ("Gamma"/"Poisson"), a variance
#                   function of X, or the AR(1) coefficient rho
#   sigma_sq:       error variance (also reused as the Gamma shape and the
#                   Poisson lambda in the non-normal cases)
#   orig_X:         original predictor values
#
# Returns: list(X, new_obsv, eps). Unknown violation names (or unknown
# non-normal distributions) now raise an error instead of silently
# returning NULL.
check_violations <- function(sample_size, orig_new_obsv, violation_name, violation_func, sigma_sq, orig_X) {
  # ordinary relationship with no violations
  if (violation_name == "None") {
    X <- orig_X
    new_obsv <- orig_new_obsv
    eps <- rnorm(sample_size, mean = 0, sd = sqrt(sigma_sq))
    return(list(X, new_obsv, eps))
  }
  # apply the same nonlinear transform to both training X and new points
  if (violation_name == "Nonlinear relationship") {
    X <- violation_func(orig_X)
    new_obsv <- violation_func(orig_new_obsv)
    eps <- rnorm(sample_size, mean = 0, sd = sqrt(sigma_sq))
    return(list(X, new_obsv, eps))
  }
  if (violation_name == "Non-normal errors") {
    # gamma distributed errors, centered at zero; the *2 scaling inflates
    # the error variance beyond sigma_sq (the write-up relies on this)
    if (violation_func == "Gamma") {
      X <- orig_X
      new_obsv <- orig_new_obsv
      eps_alpha <- sigma_sq
      eps_beta <- 1
      eps <- (rgamma(sample_size, shape = eps_alpha, scale = eps_beta) - eps_alpha * eps_beta) * 2
      return(list(X, new_obsv, eps))
    }
    # poisson distributed errors, centered at zero, same deliberate scaling
    if (violation_func == "Poisson") {
      X <- orig_X
      new_obsv <- orig_new_obsv
      lambda <- sigma_sq
      eps <- (rpois(sample_size, lambda) - lambda) * 2
      return(list(X, new_obsv, eps))
    }
    stop("Unknown non-normal error distribution: ", violation_func, call. = FALSE)
  }
  # NOTE(review): violation_func(X) is used as the SD directly (not the
  # sqrt of a variance) -- preserved as-is; confirm that is intended
  if (violation_name == "Heterogeneous variances") {
    X <- orig_X
    new_obsv <- orig_new_obsv
    eps <- rnorm(sample_size, mean = 0, sd = violation_func(X))
    return(list(X, new_obsv, eps))
  }
  # AR(1) errors: eps_i = rho * eps_(i-1) + e_i
  # NOTE(review): sd uses sigma_sq (a variance) rather than sqrt(sigma_sq);
  # preserved as-is since the write-up interprets the inflated MSE
  if (violation_name == "Correlated Errors") {
    X <- orig_X
    rho <- violation_func
    new_obsv <- orig_new_obsv
    eps <- rep(0, sample_size)
    e.ind <- rnorm(sample_size, mean = 0, sd = (sigma_sq / sqrt(1 - rho^2)))
    eps[1] <- e.ind[1]
    # seq_len() guards the sample_size == 1 case (2:1 would run backwards)
    for (i in seq_len(sample_size - 1) + 1) {
      eps[i] <- rho * eps[i - 1] + e.ind[i]
    }
    return(list(X, new_obsv, eps))
  }
  # second predictor is a deterministic-ish function of the first
  if (violation_name == "Multicolinearity") {
    X <- unname(cbind(orig_X[, 1], violation_func(orig_X[, 1])))
    new_obsv <- unname(cbind(orig_new_obsv[, 1], violation_func(orig_new_obsv[, 1])))
    eps <- rnorm(sample_size, mean = 0, sd = sqrt(sigma_sq))
    return(list(X, new_obsv, eps))
  }
  stop("Unknown violation_name: ", violation_name, call. = FALSE)
}
# function that will run simulations, violating a specific assumption
# Monte-Carlo driver: simulates num_runs data sets from a true linear model
# (optionally with one violated assumption via check_violations), fits OLS
# each time, and tallies how often 95% confidence intervals miss the truth,
# the average residual MSE, and how often coefficients look insignificant
# at alpha = .05. Returns a list with the aggregated rates plus the
# model/data from the LAST run only (consumed by the diagnostics helpers).
run_simulations <- function(num_runs, sample_size, num_new_obsv, violation, violation_func, num_predictors, sigma_sq) {
# accumulators; summed across runs, then averaged after the loop
sim_results <- data.frame(percent_yhat_bad_confint = 0,
percent_intercept_bad_confint = 0,
percent_param_bad_confint = 0,
average_mse = 0,
percent_insignificant_intercept = 0,
percent_insignificant_param = 0)
# true coefficients (intercept + one per predictor), drawn once per call
parameters <- sample.int(10,num_predictors + 1)
for (i in 1:num_runs) {
# fresh predictors each run; only the 1- and 2-predictor cases are generated
if (num_predictors == 2) {
X <- cbind(runif(sample_size, 0, 10),runif(sample_size, 30, 40))
new_obsv <- cbind(runif(num_new_obsv, 0, 10),runif(num_new_obsv, 30, 40))
} else {X <- runif(sample_size, 0, 10)
new_obsv <- runif(num_new_obsv, 0, 10)}
# transform our data to violate assumptions of linearity
violated_data <- check_violations(sample_size, new_obsv, violation, violation_func, sigma_sq, X)
violated_X <- violated_data[[1]]
violated_new_obsv <- violated_data[[2]]
eps <- violated_data[[3]]
# TODO: make this not break plot_diagnostics
# for multicollinearity the fitted model must see both (correlated) columns
if (violation == "Multicolinearity") {
X <- violated_X
new_obsv <- violated_new_obsv
}
# add in column of ones to capture beta0 when generating Y and expected value of Y
# note: Y is generated from the VIOLATED data, while the model is fit on X
Y <- (cbind(1,violated_X) %*% parameters) + eps
exp_val_Y <- cbind(1,violated_new_obsv) %*% parameters
# fit a linear model using the simulated data
lm_data <- data.frame(Y = Y, X = I(X))
my_lm <- lm(Y ~ X, data=lm_data)
# predict y for new values of x using the fitted linear model
Y_hat_ci <- predict(my_lm, newdata = data.frame(X = I(new_obsv)), interval = "confidence")
# check whether the confidence interval actually captures the expected value of y from the true model
# columns 2 and 3 of the predict() matrix are the lwr/upr CI bounds
n_bad_predictions <- num_new_obsv - sum((exp_val_Y >= Y_hat_ci[,2]) & (exp_val_Y <= Y_hat_ci[,3]))
# freq of confint for prediction of new data points not capturing the expected val of Y based on ground truth
sim_results$percent_yhat_bad_confint <- sim_results$percent_yhat_bad_confint + n_bad_predictions
# get confidence interval for beta hat
beta_ci <- confint(my_lm)
# check whether the true value for the betas is captured in the confidence interval
good_betas <- ((parameters >= beta_ci[,"2.5 %"]) & (parameters <= beta_ci[,"97.5 %"]))
if (good_betas[1] == FALSE)
# how many times true beta_0 was not captured by the confint for our fitted model
sim_results$percent_intercept_bad_confint <- sim_results$percent_intercept_bad_confint + 1
# good_betas[-1] means anything that is not the intercept coeff
if (sum(good_betas[-1] == FALSE) >= 1)
# bad predictor rate: how many times a parameter that was not b0 was not within the confint
# if there's more than one they are combined
{sim_results$percent_param_bad_confint <- sim_results$percent_param_bad_confint + sum(good_betas[-1] == FALSE)}
# add the mse to the sum of MSE for all runs (to be averaged at the end)
sim_results$average_mse <- sim_results$average_mse + (anova(my_lm)$'Mean Sq'[2]) # [2] for residual
# check which beta_hats are significant based on the p-values
p_vals <- summary(my_lm)$coeff[,4] # 4 for pvals
if (p_vals[1] > .05)
# frequency % p-val(intercept) was >0.05
sim_results$percent_insignificant_intercept <- sim_results$percent_insignificant_intercept + 1
if (sum(p_vals[-1] > .05) >= 1)
# excluding intercept coeff, freq that params for predictors had p-val > 0.05 in all our sims. on interval (0,1)
# sum in case more than one predictor var
{sim_results$percent_insignificant_param <- sim_results$percent_insignificant_param + sum(p_vals[-1] > .05)}
}
# FINALLY, take mean of all these
sim_results <- sim_results/num_runs
# confint for predictions did not capture expected value, divided by n new data points
sim_results$percent_yhat_bad_confint <- sim_results$percent_yhat_bad_confint/(num_new_obsv)
# confint for predictor param did not capture true param used to generate data, divided by number of predictors
sim_results$percent_param_bad_confint <- sim_results$percent_param_bad_confint/(num_predictors)
# percent of (non-intercept) params whose p-val > 0.05
sim_results$percent_insignificant_param <- sim_results$percent_insignificant_param/(num_predictors)
# R is terrible and we have to return our objects as a list >:C
payload <- list(sim_results=sim_results, my_lm=my_lm, X=X, Y=Y, eps=eps, Y_hat_ci=Y_hat_ci, exp_val_Y=exp_val_Y, n=sample_size, num_predictors=num_predictors)
return(payload)
}
# Print and plot the standard regression diagnostics for a payload produced
# by run_simulations(): linearity, constant variance, normality, and
# independence of residuals -- plus a pairs plot for multicollinearity in
# the two-predictor case. Called for its plotting side effects.
#
# Args:
#   payload: list with at least my_lm, X, Y, n, num_predictors
plot_diagnostics <- function(payload) {
  # extract the fields explicitly instead of list2env(): it is then clear
  # exactly which payload fields this function depends on
  my_lm <- payload$my_lm
  X <- payload$X
  Y <- payload$Y
  n <- payload$n
  num_predictors <- payload$num_predictors
  if (num_predictors > 1) {
    # hard-coded names assume exactly two predictors -- TODO confirm callers
    multivariate_dataframe <- as.data.frame(cbind(X, Y))
    colnames(multivariate_dataframe) <- c("X1", "X2", "Y")
    multivariate_linear_model <- lm(Y~., data = multivariate_dataframe)
    writeLines("Diagnostic #1: Is the Regression Function Linear? ")
    plot(multivariate_linear_model, which = 1, caption = "Residuals vs Fitted")
    writeLines("Diagnostic #2: Do the Residuals Have Constant Variance?\n\n Does the plot appear to be a general cloud of points or is there a megaphone shape?")
    # NOTE(review): same residuals-vs-fitted panel as diagnostic #1
    plot(multivariate_linear_model, which = 1)
    writeLines("Diagnostic #3: Are the Residuals Normally Distributed?")
    plot(multivariate_linear_model, which = 2)
    writeLines("Diagnostic #4: Are the Residuals Independent?")
    # residuals in observation order; patterns suggest serial correlation
    plot(1:n, residuals(multivariate_linear_model))
    writeLines("Diagnostic #5: Is there Evidence of Multicollinearity?")
    pairs(multivariate_dataframe)
  } else {
    writeLines("Diagnostic #1: Is the Regression Function Linear? ")
    plot(Y ~ X)
    abline(my_lm)
    writeLines("Diagnostic #2: Do the Residuals Have Constant Variance?\n\n Does the plot appear to be a general cloud of points or is there a megaphone shape?")
    plot(my_lm, which = 1)
    writeLines("Diagnostic #3: Are the Residuals Normally Distributed?")
    plot(my_lm, which = 2)
    writeLines("Diagnostic #4: Are the Residuals Independent?")
    plot(1:n, residuals(my_lm))
  }
}
# Print the coefficient summary and ANOVA table for the fitted model in a
# run_simulations() payload.
#
# Args:
#   payload: list containing my_lm (an lm fit)
summary_diagnostics <- function(payload) {
  # extract the single field we need instead of list2env()
  my_lm <- payload$my_lm
  print(summary(my_lm))
  print(anova(my_lm))
}
# Run formal hypothesis tests for the regression assumptions on a payload
# produced by run_simulations(): Breusch-Pagan (constant variance),
# Shapiro-Wilk (normal residuals), and -- in the two-predictor case --
# variance inflation factors for multicollinearity.
# Requires lmtest (bptest) and car (vif), loaded at the top of the file.
#
# Args:
#   payload: list with at least my_lm, X, Y, num_predictors
advanced_diagnostics <- function(payload) {
  # extract fields explicitly instead of list2env()
  my_lm <- payload$my_lm
  X <- payload$X
  Y <- payload$Y
  num_predictors <- payload$num_predictors
  #check for multivariate case
  if (num_predictors > 1) {
    multivariate_dataframe <- as.data.frame(cbind(X, Y))
    # hard-coded names assume exactly two predictors -- TODO confirm callers
    colnames(multivariate_dataframe) <- c("X1", "X2", "Y")
    multivariate_linear_model <- lm(Y~., data = multivariate_dataframe)
    #constant var
    writeLines("Diagnostic #2: Do the Residuals Have Constant Variance?\n\n Does the plot appear to be a general cloud of points or is there a megaphone shape?")
    writeLines("Large values of the test statistic lead to us rejecting H0 and concluding that we have nonconstant variance (ie heteroskedacity).")
    print(bptest(multivariate_linear_model))
    #residuals nonnormal
    writeLines("Diagnostic #3: Are the Residuals Normally Distributed?")
    print(shapiro.test(residuals(multivariate_linear_model)))
    #multicollinearity
    writeLines("Diagnostic #5: Is there Evidence of Multicollinearity?")
    writeLines("Here are some rough guidelines:
VIF_j≈1: no evidence of multicollinearity for x_j
1<VIF_j<5: evidence of moderate multicollinearity for x_j
5≤VIF_j<10: evidence of strong multicollinearity for x_j
VIF_j≥10 evidence of severe multicollinearity for x_j, will almost certainly cause problems")
    print(vif(multivariate_linear_model))
  } else {
    #bivariate case
    # constant variance
    writeLines("Diagnostic #2: Do the Residuals Have Constant Variance?")
    # heteroskedacity
    writeLines("Breusch-Pagan Test\n\nA p-value less than .05 will lead us to reject H0 and conclude that we have nonconstant variance (ie heteroskedacity).")
    print(bptest(Y ~ X))
    #normally dist residuals
    writeLines("Diagnostic #3: Are the Residuals Normally Distributed?")
    writeLines("Shapiro-Wilk's Test\n\nA p-value less than .05 will lead us to reject H0 and conclude that the errors are not normally distributed.")
    print(shapiro.test(residuals(my_lm)))
  }
}
# Convenience wrapper: a single simulation run (num_runs = 1) with 20 new
# observations and sigma^2 = 4, used for the randomized "part e" checks.
random_sim <- function(n, random_violation, random_violation_func, num_predictors) {
  run_simulations(
    num_runs = 1,
    sample_size = n,
    num_new_obsv = 20,
    violation = random_violation,
    violation_func = random_violation_func,
    num_predictors = num_predictors,
    sigma_sq = 4
  )
}
We predict that the smaller models will be more likely to be affected by a nonlinear relationship. With large sample sizes, the normality assumption is not critical unless you are predicting new observations. But with smaller sample sizes, a violation of normality is more likely to affect the model. And since we only have 10 observations in the smaller simulation, non-normality is especially difficult to detect because we expect random variation in small sample sizes anyway.
Aside: Many of the issues you end up facing with a nonlinear relationship can also be seen if an important predictor is excluded from the model. If you have extra time, feel free to play with this issue as well (optional).
# set up constants for simulations
set.seed(123)
n_small <- 20
n_med <- 100
n_large <- 5000
num_runs <- 100
num_new_obsv <- 20
# num_predictors <- 1
sigma_sq <- 4
# quadratic mean function centered at x = 5
poly_func <- function(x) (x-5)^2
# nonlinear (quadratic) relationship at three sample sizes ----
poly_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = poly_func,
num_predictors = 1,
sigma_sq)
poly_med_results <-run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = poly_func,
num_predictors = 1,
sigma_sq)
poly_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = poly_func,
num_predictors = 1,
sigma_sq)
# nonlinear (exponential) relationship ----
exp_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = exp,
num_predictors = 1,
sigma_sq)
exp_med_results <-run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = exp,
num_predictors = 1,
sigma_sq)
exp_large_results <-run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = exp,
num_predictors = 1,
sigma_sq)
# nonlinear (sinusoidal) relationship ----
sin_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = sin,
num_predictors = 1,
sigma_sq)
sin_med_results <- run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = sin,
num_predictors = 1,
sigma_sq)
sin_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = sin,
num_predictors = 1,
sigma_sq)
# nonlinear (logarithmic) relationship ----
log_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = log,
num_predictors = 1,
sigma_sq)
log_med_results <- run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = log,
num_predictors = 1,
sigma_sq)
log_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Nonlinear relationship",
violation_func = log,
num_predictors = 1,
sigma_sq)
TODO template for part E
We used R to choose whether to simulate data from a true linear model or a true nonlinear model. And then we simulated data accordingly and displayed diagnostics as appropriate.
From this simulation, we can tell that a linear function was generated.
Based on the diagnostics, we predict that the problem areas that we mentioned in part d will/will not be affected.
# stack the aggregated rates for each sample size into one table per
# nonlinear relationship (outer parens print the result)
(polynomial_results <- rbind(small_samp = poly_small_results[[1]],
med_samp = poly_med_results[[1]],
large_samp = poly_large_results[[1]]))
(exponential_results <- rbind(small_samp = exp_small_results[[1]],
med_samp = exp_med_results[[1]],
large_samp = exp_large_results[[1]]))
(sinusoidal_results <- rbind(small_samp = sin_small_results[[1]],
med_samp = sin_med_results[[1]],
large_samp = sin_large_results[[1]]))
(logarithmic_results <- rbind(small_samp = log_small_results[[1]],
med_samp = log_med_results[[1]],
large_samp = log_large_results[[1]]))
Prediction confidence intervals
Each different type of non-linear relationship severely affected the quality of the predictions, and the quality continued to decrease as sample size increased.
Confidence interval for \(\beta_0\)
The confidence interval for the intercept failed to capture the true intercept for most of the simulations and its accuracy in capturing it decreased significantly as the sample size increased. For each type of non-linear relationship we tested, the confidence interval for \(\beta_0\) failed to capture the true value of the intercept for every single simulation when the sample size was set to 5000.
Confidence interval for \(\beta_1\)
The confidence interval for \(\beta_1\) performed even more poorly than the CI for the intercept. The vast majority of the simulations failed to capture the true value.
Average MSE
The average MSE for each non-linear relationship tested was wildly inaccurate as an estimator of the variance of the error, \(\sigma^2\), which was set to be 4 for each simulation.
P-values for \(\beta_0\) and \(\beta_1\)
The p-value for the intercept was usually found to be significant (less than \(\alpha = .05\)), regardless of the underlying non-linear relationship, especially if the sample size was high. In the exponential and logarithmic simulations, the coefficient \(\beta_1\) was also usually significant, but the polynomial and sinusoidal simulations usually found that predictor to be insignificant.
TODO write interpretation
#part e, name the normal func & nonnormal func & randomly call it
# draw 3 candidate mean functions and, independently, whether each run
# violates linearity; sample size is shared across the three runs
q1_random_func <- sample(c(poly_func, sin, log, exp), 3)
n <- sample(20:5000, 1)
q1_random_violation_1 <- sample(c("None","Nonlinear relationship"), 1)
q1_random_results_1 <- random_sim(n,
random_violation = q1_random_violation_1,
random_violation_func = q1_random_func[[1]],
num_predictors = 1)
q1_random_violation_2 <- sample(c("None","Nonlinear relationship"), 1)
q1_random_results_2 <- random_sim(n,
random_violation = q1_random_violation_2,
random_violation_func = q1_random_func[[2]],
num_predictors = 1)
q1_random_violation_3 <- sample(c("None","Nonlinear relationship"), 1)
q1_random_results_3 <- random_sim(n,
random_violation = q1_random_violation_3,
random_violation_func = q1_random_func[[3]],
num_predictors = 1)
plot_diagnostics(q1_random_results_1)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
plot_diagnostics(q1_random_results_2)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
plot_diagnostics(q1_random_results_3)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
The first two simulations show a linear relationship in the scatter plot, but the third simulation shows a distinctly non-linear relationship (most likely a sinusoidal relationship). From these plots we would expect that the fitted model from the third simulation would lead to inaccurate coefficients, inaccurate predictions, and an MSE that does not estimate \(\sigma^2\) well.
# aggregated rates for the third (suspect) random simulation
q1_random_results_3[[1]]
# non-normal (centered Gamma) errors at three sample sizes ----
gamma_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Non-normal errors",
violation_func = "Gamma",
num_predictors = 1,
sigma_sq)
gamma_med_results <- run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Non-normal errors",
violation_func = "Gamma",
num_predictors = 1,
sigma_sq)
gamma_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Non-normal errors",
violation_func = "Gamma",
num_predictors = 1,
sigma_sq)
# non-normal (centered Poisson) errors ----
poisson_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Non-normal errors",
violation_func = "Poisson",
num_predictors = 1,
sigma_sq)
poisson_med_results <- run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Non-normal errors",
violation_func = "Poisson",
num_predictors = 1,
sigma_sq)
poisson_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Non-normal errors",
violation_func = "Poisson",
num_predictors = 1,
sigma_sq)
# result tables (outer parens print)
(gamma_results <- rbind(small_samp = gamma_small_results[[1]],
med_samp = gamma_med_results[[1]],
large_samp = gamma_large_results[[1]]))
(poisson_results <- rbind(small_samp = poisson_small_results[[1]],
med_samp = poisson_med_results[[1]],
large_samp = poisson_large_results[[1]]))
Prediction confidence intervals
The confidence interval for the predictions performed very well. They failed about 5% of the time, which is to be expected since it is a 95% confidence interval.
Confidence interval for \(\beta_0\)
The confidence interval for the intercept performed very well, also failing for about 5% of the simulations.
Confidence interval for \(\beta_1\)
The confidence interval for \(\beta_1\) performed very well, also failing for about 5% of the simulations.
Average MSE
The variance for the errors was always set to 4, regardless of which distribution was used to generate them. Both simulations using either the Gamma or Poisson distributed errors resulted in an average MSE of approximately 16, which is significantly higher than the true variance.
P-values for \(\beta_0\) and \(\beta_1\)
The simulations using the small sample size of 20 would often find the intercept to be insignificant. However, the coefficient for the predictor was always found to be significant, regardless of sample size or the distribution used to generate the errors.
TODO write interpretation
#part e, name the normal func & nonnormal func & randomly call it
# shuffle the two error distributions and the two violation settings,
# then pair them up across two random runs
q2_random_func <- sample(c("Poisson","Gamma"), 2)
n <- sample(20:5000, 1)
q2_random_violation <- sample(c("None","Non-normal errors"), 2)
q2_random_results_1 <- random_sim(n,
random_violation = q2_random_violation[1],
random_violation_func = q2_random_func[1],
num_predictors = 1)
q2_random_results_2 <- random_sim(n,
random_violation = q2_random_violation[2],
random_violation_func = q2_random_func[2],
num_predictors = 1)
plot_diagnostics(q2_random_results_1)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
plot_diagnostics(q2_random_results_2)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
The plots for the first simulation look fine and we expect that the errors from that model were normally distributed. The plots for the second simulation definitely reveal that an assumption about the errors was violated and while the model may still be able to make decent predictions, we could not use the MSE to accurately describe the true variance of the system.
# confirm which of the two runs actually violated the normality assumption
q2_random_violation[2] == "Non-normal errors"
## [1] TRUE
# error SD grows quadratically with x (passed as the variance function)
poly_func_2 <- function(x) 0.5*x^2
# heteroskedastic errors, polynomial variance, three sample sizes ----
hetero_var_poly_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Heterogeneous variances",
violation_func = poly_func_2,
num_predictors = 1,
sigma_sq)
hetero_var_poly_med_results <-run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Heterogeneous variances",
violation_func = poly_func_2,
num_predictors = 1,
sigma_sq)
hetero_var_poly_large_results <-run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Heterogeneous variances",
violation_func = poly_func_2,
num_predictors = 1,
sigma_sq)
# heteroskedastic errors, exponential variance ----
hetero_var_exp_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Heterogeneous variances",
violation_func = exp,
num_predictors = 1,
sigma_sq)
hetero_var_exp_med_results <-run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Heterogeneous variances",
violation_func = exp,
num_predictors = 1,
sigma_sq)
hetero_var_exp_large_results <-run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Heterogeneous variances",
violation_func = exp,
num_predictors = 1,
sigma_sq)
# result tables
hetero_var_poly_results <- rbind(small_samp = hetero_var_poly_small_results[[1]],
med_samp = hetero_var_poly_med_results[[1]],
large_samp = hetero_var_poly_large_results[[1]])
hetero_var_poly_results
hetero_var_exp_results <- rbind(small_samp = hetero_var_exp_small_results[[1]],
med_samp = hetero_var_exp_med_results[[1]],
large_samp = hetero_var_exp_large_results[[1]])
hetero_var_exp_results
Prediction confidence intervals
While the confidence intervals for the predictions did not perform as poorly as they did for some of the other simulations, they do fail to capture the true expected value of the response variable more than 5% of the time. The rate of bad predictions does decrease as sample size increases.
Confidence interval for \(\beta_0\)
The CI for the intercept would almost always capture the true value of the intercept, which may indicate that the interval was very large since it almost never failed.
Confidence interval for \(\beta_1\)
The CI for \(\beta_1\) failed more often than it did for the intercept, but no more often than 25% of the time. The failure rate also decreased as sample size increased.
Average MSE
The average MSE is extremely inaccurate as an estimator for \(\sigma^2\), which is not surprising since we are violating one of the assumptions about the error term.
P-values for \(\beta_0\) and \(\beta_1\)
The coefficient for the intercept was found to insignificant quite often, especially when the variance had an exponential relationship with the predictor. When the variance had a polynomial relationship with the predictor, the intercept was always found to be significant when the sample size was large. The coefficient of the predictor was found to be insignificant less often than the intercept but often enough to suggest that the fitted linear model is not an accurate representation of the true model.
TODO write interpretation
#part e, name the normal func & nonnormal func & randomly call it
# random polynomial variance function: (x - a)^b with random a and b
random_poly_func <- function(x) {
a <- runif(1,0,5)
b <- runif(1,2,4)
return((x-a)^b)
}
q3_random_func <- sample(c(random_poly_func, exp),1)
n <- n_med
# shuffle so exactly one of the two runs violates constant variance
q3_random_violation <- sample(c("None","Heterogeneous variances"), 2)
q3_random_results_1 <- random_sim(n,
random_violation = q3_random_violation[1],
random_violation_func = q3_random_func[[1]],
num_predictors = 1)
q3_random_results_2 <- random_sim(n,
random_violation = q3_random_violation[2],
random_violation_func = q3_random_func[[1]],
num_predictors = 1)
plot_diagnostics(q3_random_results_1)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
plot_diagnostics(q3_random_results_2)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
The plots for the second simulation definitely show that variance of the errors is not constant since the residuals vs fitted values plot shows a megaphone shape instead of a general cloud of points. We would expect that the quality of predictions is low and that the MSE does not represent the true variance of the system since it is obviously not a constant value.
# confirm which of the two runs actually violated constant variance
q3_random_violation[2] == "Heterogeneous variances"
## [1] TRUE
# Sample code to get started
# AR(1) correlated errors with rho = .9, three sample sizes ----
corr_error_small_results_h <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Correlated Errors",
violation_func = 0.9,
num_predictors = 1,
sigma_sq)
corr_error_med_results_h <-run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Correlated Errors",
violation_func = 0.9,
num_predictors = 1,
sigma_sq)
corr_error_large_results_h <-run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Correlated Errors",
violation_func = 0.9,
num_predictors = 1,
sigma_sq)
#correlated errors with rho of .3
corr_error_small_results_l <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Correlated Errors",
violation_func = 0.3,
num_predictors = 1,
sigma_sq)
corr_error_med_results_l <-run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Correlated Errors",
violation_func = 0.3,
num_predictors = 1,
sigma_sq)
corr_error_large_results_l <-run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Correlated Errors",
violation_func = 0.3,
num_predictors = 1,
sigma_sq)
# result tables: _h = high correlation (.9), _l = low correlation (.3)
corr_error_results_h <- rbind(small_samp = corr_error_small_results_h[[1]],
med_samp = corr_error_med_results_h[[1]],
large_samp = corr_error_large_results_h[[1]])
corr_error_results_h
corr_error_results_l <- rbind(small_samp = corr_error_small_results_l[[1]],
med_samp = corr_error_med_results_l[[1]],
large_samp = corr_error_large_results_l[[1]])
corr_error_results_l
I compared different sample sizes with different values of rho, the level of correlation in the error terms. After running the simulations, several assumptions were violated. Most notably, the value of the average MSE, which estimates the variance, in each case was greater than the true value of \(\sigma^2 = 4\). When rho = .9, the average MSE was far greater, and, although not by much, when rho = .3 the average MSE was greater as well. In the case where rho was set to .9, the true value of \(\hat{y}\) was missing in over half of the simulations’ confidence intervals. Similarly, the ratio of simulations that failed to capture the true intercept was greater than 30% at each sample size level. Another violation appeared in the ratio of simulations that found the intercept to be insignificant; in the small and medium sample sizes this condition was violated over 39% of the time. Despite these violations, the ratio of simulations that failed to include the true value of \(\beta_{1}\) was very low, less than 10%. In addition, none of the simulations considered \(\beta_{1}\) to be insignificant. Strangely, the simulations with a value of rho = .3 and a small sample size failed to capture the true \(\beta_{0}\) over 70% of the time.
# randomly select a value of rho equal to .8 or .1 for a large sample size
set.seed(1234)
rho_vec <- c(0.1, .8)
#choose random value of rho
rho <- sample(rho_vec,1)
# true model: y = b0 + b1 * x + AR(1) errors
b0 <- 10
b1 <- 2
sigma <- 4
n <- 500
x <- runif(n, 0, 10)
newx <- runif(n,0,10)
# build the AR(1) error series: eps_i = rho * eps_(i-1) + e_i
eps <- rep(0, n)
e.ind <- rnorm(n, mean = 0, sd = (sigma / sqrt(1 - rho^2)))
eps[1] <- e.ind[1]
for (i in 2:n) {
eps[i] <- rho * eps[i-1] + e.ind[i]
}
y <- b0 + b1 * x + eps
lm.random <- lm(y~x)
# standard lm diagnostic panels, then residuals in observation order
plot(lm.random)
plot(1:n, residuals(lm.random))
Given the plot of the residuals, there doesn’t appear to be correlation in the error terms. Therefore, the problems described above should not arise and the linear model can be applied.
# second predictor = first predictor plus standard-normal noise
add_noise <- function(x) x + rnorm(length(x))
# multicollinearity via noisy copy of X1, three sample sizes ----
multicolinearity_noise_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Multicolinearity",
violation_func = add_noise,
num_predictors = 2,
sigma_sq)
multicolinearity_noise_med_results <- run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Multicolinearity",
violation_func = add_noise,
num_predictors = 2,
sigma_sq)
multicolinearity_noise_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Multicolinearity",
violation_func = add_noise,
num_predictors = 2,
sigma_sq)
# second predictor = X1 squared (deterministic relationship)
quadratic <- function(x) x^2
multicolinearity_poly_small_results <- run_simulations(num_runs,
n_small,
num_new_obsv,
violation = "Multicolinearity",
violation_func = quadratic,
num_predictors = 2,
sigma_sq)
multicolinearity_poly_med_results <- run_simulations(num_runs,
n_med,
num_new_obsv,
violation = "Multicolinearity",
violation_func = quadratic,
num_predictors = 2,
sigma_sq)
multicolinearity_poly_large_results <- run_simulations(num_runs,
n_large,
num_new_obsv,
violation = "Multicolinearity",
violation_func = quadratic,
num_predictors = 2,
sigma_sq)
# result tables
multicolinearity_noise_results <- rbind(small_samp = multicolinearity_noise_small_results[[1]],
med_samp = multicolinearity_noise_med_results[[1]],
large_samp = multicolinearity_noise_large_results[[1]])
multicolinearity_noise_results
multicolinearity_poly_results <- rbind(small_samp = multicolinearity_poly_small_results[[1]],
med_samp = multicolinearity_poly_med_results[[1]],
large_samp = multicolinearity_poly_large_results[[1]])
multicolinearity_poly_results
Prediction confidence intervals
The confidence intervals for the predictions performed as expected when no assumptions are violated, where the expected value of Y based on the true model was captured within the confidence interval about 95% of the time.
Confidence interval for \(\beta_0\)
The confidence intervals for the intercept performed as expected when no assumptions are violated, where the true value of the intercept was captured within the confidence interval about 95% of the time.
Confidence interval for \(\beta_1\)
The confidence intervals for \(\beta_1\) performed as expected when no assumptions are violated, where the true value of \(\beta_1\) was captured within the confidence interval about 95% of the time.
Average MSE
Of all the simulations run, these had the best behaved values for the MSE, where the average MSE of all the simulations was approximately the true value of \(\sigma^2 = 4.\)
P-values for \(\beta_0\) and \(\beta_1\) The simulations where X2 was simply X1 with some random noise added always found both the intercept and \(\beta_1\) to be significant, regardless of sample size. In the simulation where X2 had a polynomial relationship with X1, the intercept was often found to be insignificant and occasionally \(\beta_1\) was also found to be insignificant. However, both of those rates went to zero as sample size increased.
## PART E TODO write interpretation
#part e, name the normal func & nonnormal func & randomly call it
# shuffle the two X2-generating functions and the two violation settings
n <- n_med
q5_random_func <- sample(c(add_noise,quadratic),2)
q5_random_violation <- sample(c("None","Multicolinearity"), 2)
q5_random_results_1 <- random_sim(n,
random_violation = q5_random_violation[1],
random_violation_func = q5_random_func[[1]],
num_predictors = 2)
q5_random_results_2 <- random_sim(n,
random_violation = q5_random_violation[2],
random_violation_func = q5_random_func[[2]],
num_predictors = 2)
# random_results5[[1]]
plot_diagnostics(q5_random_results_1)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
## Diagnostic #5: Is there Evidence of Multicollinearity?
plot_diagnostics(q5_random_results_2)
## Diagnostic #1: Is the Regression Function Linear?
## Diagnostic #2: Do the Residuals Have Constant Variance?
##
## Does the plot appear to be a general cloud of points or is there a megaphone shape?
## Diagnostic #3: Are the Residuals Normally Distributed?
## Diagnostic #4: Are the Residuals Independent?
## Diagnostic #5: Is there Evidence of Multicollinearity?
#check our answers